# Import the data.
# NOTE(review): setwd() with an absolute personal path makes this script
# non-portable — consider a relative path or a parameter instead.
setwd("/Users/ruizhipu/Desktop")
df <- read.csv("stock_price.csv", header = TRUE)
# Transpose so rows are stocks and columns are time points.
df <- t(df)
# Dimensions of the data (30 stocks x 127 observations, per output below).
dim(df)
## [1] 30 127
# Principal component analysis on the stocks; scale = TRUE standardizes
# each column before computing the components.
pcastock <- prcomp(df, scale = TRUE)
#pcastock
summary(pcastock)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 10.3315 2.4535 1.7092 1.5615 1.36449 1.10036 1.0391
## Proportion of Variance 0.8405 0.0474 0.0230 0.0192 0.01466 0.00953 0.0085
## Cumulative Proportion 0.8405 0.8879 0.9109 0.9301 0.94472 0.95426 0.9628
## PC8 PC9 PC10 PC11 PC12 PC13
## Standard deviation 0.93168 0.83322 0.76973 0.69701 0.63281 0.57939
## Proportion of Variance 0.00683 0.00547 0.00467 0.00383 0.00315 0.00264
## Cumulative Proportion 0.96959 0.97506 0.97973 0.98355 0.98670 0.98935
## PC14 PC15 PC16 PC17 PC18 PC19
## Standard deviation 0.56076 0.52209 0.44984 0.36731 0.34632 0.33101
## Proportion of Variance 0.00248 0.00215 0.00159 0.00106 0.00094 0.00086
## Cumulative Proportion 0.99182 0.99397 0.99556 0.99663 0.99757 0.99843
## PC20 PC21 PC22 PC23 PC24 PC25
## Standard deviation 0.27238 0.2256 0.1604 0.13618 0.12342 0.09085
## Proportion of Variance 0.00058 0.0004 0.0002 0.00015 0.00012 0.00006
## Cumulative Proportion 0.99902 0.9994 0.9996 0.99977 0.99989 0.99995
## PC26 PC27 PC28 PC29 PC30
## Standard deviation 0.06140 0.03348 0.02853 0.02202 1.046e-15
## Proportion of Variance 0.00003 0.00001 0.00001 0.00000 0.000e+00
## Cumulative Proportion 0.99998 0.99999 1.00000 1.00000 1.000e+00
#the loadings
#pcastock$rotation
# Scores of each stock on the principal components.
stockpc <- predict(pcastock)
# Scree plot of component variances (fixed typo: "screenplot").
plot(pcastock, main = "scree plot")
mtext(side = 1, "Stock Price Principal Components", line = 1, font = 2)
# So, maybe we can choose PC1 as the principal component.
# NOTE(review): stockpc holds PC *scores* from predict(), not loadings;
# the y-axis label below calls them loadings — confirm intent.
plot(x = c(1:30), y = stockpc[, 1], xlab = "index of the stock",
     ylab = "loadings of the PC1",
     main = "loadings for first principal component")
plot(stockpc[, 1:2], type = "n", main = "first two principal components")
text(x = stockpc[, 1], y = stockpc[, 2], labels = row.names(df))
##Generate an MDS map to plot stocks on a two-dimensional space.
#set.seed(851982)
# Pairwise Euclidean distances between stocks (rows of df).
stock.dis = dist(df)
# Classical (metric) multidimensional scaling down to 2 dimensions.
stock.mds <- cmdscale(stock.dis)
plot(stock.mds,type = 'n',main = "MDS map")
# Label each point with its stock ticker.
text(stock.mds, labels=row.names(df))
##Use k-means and hierarchical clustering to group stocks. Specifically, you will generate 8 MDS maps for the stocks and color the stocks based on different clustering methods (k-means, h-clustering with single-link, h-clustering with complete-link, h-clustering with average-link) and different number of clusters (k = 3, k = 6). For each hierarchical clustering method, generate a dendrogram. ##hint: Standardize the data before performing clustering
# k-means, k = 3
set.seed(1)
# NOTE(review): df is not standardized here despite the hint above, so
# k-means depends on the raw price scale — confirm intended.
# ("knnstock" is a misleading name: this is k-means, not k-NN.)
knnstock <- kmeans(df, centers = 3, nstart = 10)
#knnstock
# Stocks listed in order of cluster membership.
o <- order(knnstock$cluster)
dfk3 <- data.frame(row.names(df)[o], knnstock$cluster[o])
#dfk3
# MDS map colored by cluster (+1 so no cluster is drawn in black).
plot(stock.mds, type = 'n', main = "k-means k = 3")
text(stock.mds, labels = row.names(df), col = knnstock$cluster + 1)
# k-means, k = 6
set.seed(1)
knnstock <- kmeans(df, centers = 6, nstart = 10)
#knnstock
o <- order(knnstock$cluster)
dfk6 <- data.frame(row.names(df)[o], knnstock$cluster[o])
#dfk6
plot(stock.mds, type = 'n', main = "k-means k = 6")
text(stock.mds, labels = row.names(df), col = knnstock$cluster + 1)
library(cluster)
# h-clustering with single-link (k = 3)
# Standardize the data before clustering, per the hint above.
df2 <- scale(df)
# Agglomerative clustering. Fixed typo in the metric name
# ("euclidian" -> "euclidean"); stand = TRUE re-standardizes internally.
stock2agg <- agnes(df2, diss = FALSE, metric = "euclidean", stand = TRUE, method = "single")
stock2agg
## Call: agnes(x = df2, diss = FALSE, metric = "euclidian", stand = TRUE, method = "single")
## Agglomerative coefficient: 0.7523622
## Order of objects:
## [1] AA BAC GE PFE INTC KO T MSFT CSCO VZ MRK HD DIS JPM
## [15] AXP JNJ PG TRV WMT KRFT MCD DD HPQ BA MMM UTX CVX XOM
## [29] CAT IBM
## Height (summary):
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.656 4.443 7.091 7.857 7.971 30.602
##
## Available components:
## [1] "order" "height" "ac" "merge" "diss" "call"
## [7] "method" "order.lab" "data"
# Cut a single-link hclust tree into 3 clusters.
# NOTE(review): dist(df) uses the *unstandardized* data, unlike the
# agnes() call above — confirm which is intended.
data.dis <- dist(df)
hcstock <- hclust(data.dis, method = 'single')
hcstock1 <- cutree(hcstock, k = 3)
print(hcstock1)
## AA AXP BA BAC CAT CSCO CVX DD DIS GE HD HPQ IBM INTC JNJ
## 1 1 1 1 2 1 2 1 1 1 1 1 3 1 1
## JPM KRFT KO MCD MMM MRK MSFT PFE PG T TRV UTX VZ WMT XOM
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# Dendrogram, single-link
plot(stock2agg, which.plots = 2)
# MDS plot colored by single-link cluster
plot(stock.mds, type = 'n')
text(stock.mds, labels = row.names(df), col = hcstock1 + 1)
# h-clustering with complete-link (k = 3)
# (fixed metric typo: "euclidian" -> "euclidean")
stock2agg <- agnes(df2, diss = FALSE, metric = "euclidean", stand = TRUE, method = "complete")
stock2agg
## Call: agnes(x = df2, diss = FALSE, metric = "euclidian", stand = TRUE, method = "complete")
## Agglomerative coefficient: 0.8528047
## Order of objects:
## [1] AA BAC GE PFE CSCO INTC MSFT AXP DIS JPM HD KO T VZ
## [15] MRK HPQ DD TRV WMT BA JNJ PG KRFT MCD MMM UTX CAT CVX
## [29] XOM IBM
## Height (summary):
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.656 5.926 7.848 12.116 11.863 61.404
##
## Available components:
## [1] "order" "height" "ac" "merge" "diss" "call"
## [7] "method" "order.lab" "data"
# Cut a complete-link hclust tree (unscaled distances) into 3 clusters.
data.dis <- dist(df)
hcstock <- hclust(data.dis, method = 'complete')
hcstock <- cutree(hcstock, k = 3)
print(hcstock)
## AA AXP BA BAC CAT CSCO CVX DD DIS GE HD HPQ IBM INTC JNJ
## 1 2 2 1 2 1 2 2 2 1 1 2 3 1 2
## JPM KRFT KO MCD MMM MRK MSFT PFE PG T TRV UTX VZ WMT XOM
## 2 2 1 2 2 1 1 1 2 1 2 2 1 2 2
# Dendrogram, complete-link
plot(stock2agg, which.plots = 2)
# MDS plot colored by complete-link cluster
plot(stock.mds, type = 'n')
text(stock.mds, labels = row.names(df), col = hcstock + 1)
# h-clustering with average-link (k = 3)
# (fixed metric typo: "euclidian" -> "euclidean")
stock2agg <- agnes(df2, diss = FALSE, metric = "euclidean", stand = TRUE, method = "average")
stock2agg
## Call: agnes(x = df2, diss = FALSE, metric = "euclidian", stand = TRUE, method = "average")
## Agglomerative coefficient: 0.8142313
## Order of objects:
## [1] AA BAC CSCO GE PFE INTC MSFT HD KO T VZ MRK AXP DIS
## [15] JPM DD TRV WMT HPQ BA MMM UTX JNJ PG KRFT MCD CVX XOM
## [29] CAT IBM
## Height (summary):
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.656 4.856 7.676 10.128 11.166 47.382
##
## Available components:
## [1] "order" "height" "ac" "merge" "diss" "call"
## [7] "method" "order.lab" "data"
# Cut an average-link hclust tree (unscaled distances) into 3 clusters.
data.dis <- dist(df)
hcstock <- hclust(data.dis, method = 'average')
hcstock <- cutree(hcstock, k = 3)
print(hcstock)
## AA AXP BA BAC CAT CSCO CVX DD DIS GE HD HPQ IBM INTC JNJ
## 1 1 2 1 2 1 2 2 1 1 1 1 3 1 2
## JPM KRFT KO MCD MMM MRK MSFT PFE PG T TRV UTX VZ WMT XOM
## 1 2 1 2 2 1 1 1 2 1 2 2 1 2 2
# Dendrogram, average-link
plot(stock2agg, which.plots = 2)
# MDS plot colored by average-link cluster
plot(stock.mds, type = 'n')
text(stock.mds, labels = row.names(df), col = hcstock + 1)
##k = 6
# h-clustering with single-link
# Fixed: this previously reset df2 = df (unscaled), inconsistent with the
# k = 3 section. Since agnes(stand = TRUE) standardizes internally, the
# recorded output below is identical either way.
df2 <- scale(df)
# (fixed metric typo: "euclidian" -> "euclidean")
stock2agg <- agnes(df2, diss = FALSE, metric = "euclidean", stand = TRUE, method = "single")
stock2agg
## Call: agnes(x = df2, diss = FALSE, metric = "euclidian", stand = TRUE, method = "single")
## Agglomerative coefficient: 0.7523622
## Order of objects:
## [1] AA BAC GE PFE INTC KO T MSFT CSCO VZ MRK HD DIS JPM
## [15] AXP JNJ PG TRV WMT KRFT MCD DD HPQ BA MMM UTX CVX XOM
## [29] CAT IBM
## Height (summary):
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.656 4.443 7.091 7.857 7.971 30.602
##
## Available components:
## [1] "order" "height" "ac" "merge" "diss" "call"
## [7] "method" "order.lab" "data"
# Cut a single-link hclust tree (unscaled distances) into 6 clusters.
data.dis <- dist(df)
hcstock <- hclust(data.dis, method = 'single')
hcstock <- cutree(hcstock, k = 6)
print(hcstock)
## AA AXP BA BAC CAT CSCO CVX DD DIS GE HD HPQ IBM INTC JNJ
## 1 1 2 1 3 1 3 4 1 1 1 1 5 1 4
## JPM KRFT KO MCD MMM MRK MSFT PFE PG T TRV UTX VZ WMT XOM
## 1 4 1 2 6 1 1 1 4 1 4 2 1 4 2
# Dendrogram, single-link
plot(stock2agg, which.plots = 2)
# MDS plot colored by single-link cluster
plot(stock.mds, type = 'n')
text(stock.mds, labels = row.names(df), col = hcstock + 1)
# h-clustering with complete-link (k = 6)
# (fixed metric typo: "euclidian" -> "euclidean")
stock2agg <- agnes(df2, diss = FALSE, metric = "euclidean", stand = TRUE, method = "complete")
stock2agg
## Call: agnes(x = df2, diss = FALSE, metric = "euclidian", stand = TRUE, method = "complete")
## Agglomerative coefficient: 0.8528047
## Order of objects:
## [1] AA BAC GE PFE CSCO INTC MSFT AXP DIS JPM HD KO T VZ
## [15] MRK HPQ DD TRV WMT BA JNJ PG KRFT MCD MMM UTX CAT CVX
## [29] XOM IBM
## Height (summary):
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.656 5.926 7.848 12.116 11.863 61.404
##
## Available components:
## [1] "order" "height" "ac" "merge" "diss" "call"
## [7] "method" "order.lab" "data"
# Cut a complete-link hclust tree (unscaled distances) into 6 clusters.
data.dis <- dist(df)
hcstock <- hclust(data.dis, method = 'complete')
hcstock <- cutree(hcstock, k = 6)
print(hcstock)
## AA AXP BA BAC CAT CSCO CVX DD DIS GE HD HPQ IBM INTC JNJ
## 1 2 3 1 4 1 4 2 2 1 5 2 6 1 3
## JPM KRFT KO MCD MMM MRK MSFT PFE PG T TRV UTX VZ WMT XOM
## 2 3 5 3 4 5 5 1 3 5 3 4 5 2 4
# Dendrogram, complete-link
plot(stock2agg, which.plots = 2)
# MDS plot colored by complete-link cluster
plot(stock.mds, type = 'n')
text(stock.mds, labels = row.names(df), col = hcstock + 1)
# h-clustering with average-link (k = 6)
# (fixed metric typo: "euclidian" -> "euclidean")
stock2agg <- agnes(df2, diss = FALSE, metric = "euclidean", stand = TRUE, method = "average")
stock2agg
## Call: agnes(x = df2, diss = FALSE, metric = "euclidian", stand = TRUE, method = "average")
## Agglomerative coefficient: 0.8142313
## Order of objects:
## [1] AA BAC CSCO GE PFE INTC MSFT HD KO T VZ MRK AXP DIS
## [15] JPM DD TRV WMT HPQ BA MMM UTX JNJ PG KRFT MCD CVX XOM
## [29] CAT IBM
## Height (summary):
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.656 4.856 7.676 10.128 11.166 47.382
##
## Available components:
## [1] "order" "height" "ac" "merge" "diss" "call"
## [7] "method" "order.lab" "data"
# Cut an average-link hclust tree (unscaled distances) into 6 clusters.
data.dis <- dist(df)
hcstock <- hclust(data.dis, method = 'average')
hcstock <- cutree(hcstock, k = 6)
print(hcstock)
## AA AXP BA BAC CAT CSCO CVX DD DIS GE HD HPQ IBM INTC JNJ
## 1 2 3 1 4 1 4 5 2 1 2 2 6 1 5
## JPM KRFT KO MCD MMM MRK MSFT PFE PG T TRV UTX VZ WMT XOM
## 2 5 2 3 4 2 2 1 5 2 5 3 2 5 3
# Dendrogram, average-link
plot(stock2agg, which.plots = 2)
# MDS plot colored by average-link cluster
plot(stock.mds, type = 'n')
text(stock.mds, labels = row.names(df), col = hcstock + 1)
library(foreign)
# Read the Senate roll-call votes (Stata format).
dfvt <- read.dta("sen113kh.dta")
# Drop the state-99 record (presumably the president's row — TODO confirm).
df10086 <- subset(dfvt, state < 99)
# Keep only the vote columns (drop the first 9 metadata columns).
dfvote <- df10086[, -c(1:9)]
# ex.vote is the vote cross-product matrix — a similarity matrix,
# not a distance matrix.
ex.vote <- as.matrix(dfvote) %*% t(dfvote)
#ex.dist = dist(ex.vote)
#ex.mds = cmdscale(ex.dist)
#plot(ex.mds, type = 'n')
#text(ex.mds,dt[,9],col = dt[,6]+1)
no.pres <- subset(dfvt, state < 99)
## to group all Yea and Nay types together
# Recode each vote column: codes 1-3 (Yea variants) -> 1,
# codes 4-6 (Nay variants) -> -1, codes 7-9 and 0 (absent etc.) -> 0.
for (i in 10:ncol(no.pres)) {
  no.pres[, i] <- ifelse(no.pres[, i] > 6, 0, no.pres[, i])
  no.pres[, i] <- ifelse(no.pres[, i] > 0 & no.pres[, i] < 4, 1, no.pres[, i])
  no.pres[, i] <- ifelse(no.pres[, i] > 1, -1, no.pres[, i])
}
# Numeric matrix of recoded votes, one row per senator.
dt <- as.matrix(no.pres[, 10:ncol(no.pres)])
library(ggplot2)
# Pairwise distances between senators' recoded vote vectors.
rollcall.dist = dist(dt)
# Classical MDS down to two dimensions.
rollcall.mds = cmdscale(rollcall.dist)
# Senator metadata, again excluding the state-99 record.
congress = subset(dfvt, state < 99)
# Extract each senator's surname (text before the first comma or space).
congress.names = sapply(as.character(congress$name), function(n) strsplit(n, "[, ]")[[1]][1])
# Attach name and party columns to the MDS coordinates
# (transform() coerces the cmdscale matrix to a data frame).
rollcall.mds = transform(rollcall.mds,
name = congress.names,
party = as.factor(congress$party))
# Base MDS plot with blank axes; party codes 100/200/328 are mapped to
# Dem./Rep./Ind. shapes and colors.
mds = ggplot(rollcall.mds, aes(x = X1, y = X2)) +
scale_alpha(guide="none") + theme_bw() +
theme(axis.ticks = element_blank(),
axis.text.x = element_blank(),
axis.text.y = element_blank()) +
xlab("") +
ylab("") +
scale_shape(name = "party", breaks = c("100", "200", "328"),
labels = c("Dem.", "Rep.", "Ind."), solid = FALSE) +
scale_color_manual(name = "party", values = c("100" = "blue",
"200" = "red",
"328"="green"),
breaks = c("100", "200", "328"),
labels = c("Dem.", "Rep.", "Ind.")
)
# Senators as party-coded points.
print(mds + geom_point(aes(shape = party,
color = party,
alpha = 0.75),size=3))
# Keep a copy of the MDS coordinates for the base-graphics plots below.
ex.mds = rollcall.mds
# Same map, labelled with senator names instead of points.
print(mds + geom_text(aes(color = party,
alpha = 0.75,
label = rollcall.mds$name),size=3))
# k-means, k = 2
set.seed(1)
# Standardize the recoded vote matrix before clustering.
# NOTE(review): this overwrites the earlier raw dfvote data frame.
dfvote <- scale(dt)
knnvote <- kmeans(dfvote, centers = 2, nstart = 10)
# MDS map colored by k-means cluster.
plot(x = ex.mds[,1], y = ex.mds[,2], type = 'n', main = "k-means , k =2")
text(x = ex.mds[,1], y = ex.mds[,2], labels = rollcall.mds$name, col = knnvote$cluster + 1)
# h-clustering with single-link (k = 2)
data.dis <- dist(dt)
hcstock1 <- hclust(data.dis, method = 'single')
hccut1 <- cutree(hcstock1, k = 2)
# NOTE(review): this prints the hclust object, not the cluster assignment;
# print(hccut1) may have been intended.
print(hcstock1)
##
## Call:
## hclust(d = data.dis, method = "single")
##
## Cluster method : single
## Distance : euclidean
## Number of objects: 105
plot(x = ex.mds[,1], y = ex.mds[,2], type = 'n', main="h-clustering with single-link, k=2")
text(x = ex.mds[,1], y = ex.mds[,2], labels = ex.mds$name, col = hccut1 + 1)
# h-clustering with complete-link (k = 2)
data.dis <- dist(dt)
hcstock1 <- hclust(data.dis, method = 'complete')
hccut2 <- cutree(hcstock1, k = 2)
print(hccut2)
## 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
## 1 1 2 2 1 1 2 1 2 2 2 2 2 2 2 2 1 2
## 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37
## 1 1 2 2 1 1 2 1 1 2 1 2 1 1 1 1 1 2
## 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
## 2 2 2 2 2 2 2 2 2 2 2 2 1 1 2 1 2 2
## 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73
## 2 1 1 1 2 1 2 2 2 2 2 2 2 2 2 1 2 2
## 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91
## 1 2 1 1 1 2 2 2 1 2 2 1 1 1 2 1 1 1
## 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106
## 1 1 1 2 2 2 2 2 2 2 2 1 2 1 1
# MDS map colored by complete-link cluster.
plot(x = ex.mds[,1], y = ex.mds[,2], type = 'n', main="h-clustering with complete-link, k=2")
text(x = ex.mds[,1], y = ex.mds[,2], labels = ex.mds$name, col = hccut2 + 1)
#data.dis = dist(dfvote)
#hcstock2 = hclust(data.dis,method='complete')
#hcstock2 = cutree(hcstock2,k=2)
#print(hcstock2)
#plot(ex.mds[,1:2], type = 'n')
#text(ex.mds[,1:2],dt[,9],col = hcstock2+1)
# h-clustering with average-link (k = 2)
#data.dis = dist(dfvote)
#hcstock3 = hclust(data.dis,method='average')
# Scale the vote matrix. NOTE(review): this overwrites dt with its
# scaled version for the rest of the script.
dt <- scale(dt)
# (fixed metric typo: "euclidian" -> "euclidean")
hcvote <- agnes(dt, diss = FALSE, metric = "euclidean", method = "average")
hccut3 <- cutree(hcvote, k = 2)
print(hccut3)
## [1] 1 1 2 2 1 1 2 1 2 2 2 2 2 2 2 2 1 2 1 1 2 2 1 1 2 1 1 2 1 2 1 1 1 1 1
## [36] 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 2 1 2 2 2 1 1 1 2 1 2 2 2 2 2 2 2 2 2 1
## [71] 2 2 1 2 1 1 1 2 2 2 1 2 2 1 1 1 2 1 1 1 1 1 1 2 2 2 2 2 2 2 2 1 2 1 1
#plot(hcstock3,which.plots = 2)
#plot(ex.mds[,1:2], type = 'n')
#text(ex.mds[,1:2],dt[,9],col = hcstock3+1)
# MDS map colored by average-link cluster.
plot(x = ex.mds[,1], y = ex.mds[,2], type = 'n', main="h-clustering with average-link, k=2")
text(x = ex.mds[,1], y = ex.mds[,2], labels = ex.mds$name, col = hccut3 + 1)
As we can see from the plots: for k-means, KERRY, COLLINS, MURKOWSKI, and LAUTENBERG are wrongly assigned as Democrats; for h-clustering with single-link, most of the Democrats are grouped together with the Republicans; for h-clustering with complete-link, MURKOWSKI, COLLINS, and CHIESA are wrongly assigned as Democrats; and for h-clustering with average-link, MURKOWSKI, COLLINS, and CHIESA are likewise wrongly assigned as Democrats.
# Purity of a clustering: for each cluster, count the members of its
# majority class; purity is the fraction of all points so covered.
# clusters: vector of cluster labels; classes: true class labels.
# Returns a value in (0, 1]; 1 means every cluster is class-pure.
cluster.purity <- function(clusters, classes) {
  contingency <- table(classes, clusters)
  majority_counts <- apply(contingency, 2, max)
  sum(majority_counts) / length(clusters)
}
# Weighted average entropy of a clustering: each cluster's class-label
# entropy (in bits), weighted by the cluster's share of all points.
# clusters: vector of cluster labels; classes: true class labels.
# Returns 0 for perfectly pure clusters; larger values mean more mixing.
cluster.entropy <- function(clusters, classes) {
  entropy_of <- function(counts) {
    probs <- counts / sum(counts)
    probs <- probs[probs > 0]  # 0 * log2(0) contributes nothing
    -sum(probs * log2(probs))
  }
  contingency <- table(classes, clusters)
  per_cluster <- apply(contingency, 2, entropy_of)
  weights <- colSums(contingency) / sum(contingency)
  sum(per_cluster * weights)
}
# Evaluate each clustering against the party labels with purity/entropy.
# Fixed swapped variable names: "ce"/"cp" and "ae"/"ap" previously held
# purity/entropy reversed relative to their mnemonics (the values still
# landed in the right table cells, so the table below is unchanged).
cluster2 <- as.vector(knnvote$cluster)
class <- rollcall.mds$party
kmp <- cluster.purity(cluster2, class)   # k-means purity
kme <- cluster.entropy(cluster2, class)  # k-means entropy
cluster2 <- as.vector(hccut1)
sp <- cluster.purity(cluster2, class)    # single-link purity
se <- cluster.entropy(cluster2, class)   # single-link entropy
cluster2 <- as.vector(hccut2)
cp <- cluster.purity(cluster2, class)    # complete-link purity
ce <- cluster.entropy(cluster2, class)   # complete-link entropy
cluster2 <- as.vector(hccut3)
ap <- cluster.purity(cluster2, class)    # average-link purity
ae <- cluster.entropy(cluster2, class)   # average-link entropy
# Assemble a 2 x 4 summary table (purity row, entropy row).
# NOTE(review): "table" and "class" shadow base R names; harmless here
# since function calls still resolve to base, but renaming would be safer.
table <- matrix(c(kmp, kme, sp, se, cp, ce, ap, ae), ncol = 4, byrow = FALSE)
colnames(table) <- c("k-means", "hclust-single", "hclust-complete", "hclust-average")
rownames(table) <- c("Purity", "Entropy")
table <- as.table(table)
table
## k-means hclust-single hclust-complete hclust-average
## Purity 0.9428571 0.5619048 0.9523810 0.9523810
## Entropy 0.3520954 1.0859032 0.2850529 0.2850529
H-clustering with complete-link and with average-link are the two methods that generate the most meaningful results: the plots show they have the fewest misclassified senators, and they also achieve the highest purity and the lowest entropy in the table above. Perhaps pairwise distances between data points are a better basis for forming these clusters, since the data appear to be connected in pairs; k-means may therefore fail to form the best clusters. The misassignments in the single-link model likely reflect a "chaining effect" in cluster formation, as single-link clustering is sensitive to noise in the data.